3. Les parlementaires sur le réseau social Twitter#
3.1. Analyse des données textuelles#
# --- Setup: imports, plotting backend, seed, and raw data load ---
import pandas as pd
# Project-local helpers. NOTE(review): wildcard imports hide where names like
# intervention_frequency_per_group, show, or gp_politique_color come from —
# presumably lib.figures / lib.constant; consider explicit imports.
from lib.figures import *
from lib.constant import *
from lib.utils import *
from bokeh.io import output_notebook
# Render bokeh figures inline in the notebook, without the startup banner.
output_notebook(hide_banner=True)
# Fixed seed for the stochastic steps below (e.g. the UMAP projection).
seed = 42
# Tweets by French MPs about the 2023 pension reform (Feb-June 2023).
twitter_df = pd.read_parquet('data/twitter_fev_to_juin_2023_retraite_data.parquet_v2')
# Plot tweet frequency per parliamentary group (helper from the lib package).
intervention_frequency_per_group(twitter_df)
Show code cell source
from bokeh.models import TabPanel, Tabs

# Top-10 most frequent 1- to 3-word expressions in the corpus.
df = getCountDataframe(twitter_df, top_n=10)

# One tab per n-gram length, each showing the occurrence distribution
# broken down by parliamentary group.
panels = [
    TabPanel(
        child=occurrenceDistributionPerGroupePolitique(df[df.num_words == n]),
        title=f"{n} mot" + ("s" if n > 1 else ""),
    )
    for n in (1, 2, 3)
]
show(Tabs(tabs=panels, sizing_mode="stretch_width"))
from bokeh.models import TabPanel, Tabs

# Same top-10 n-gram counts, this time broken down by political orientation.
df = getCountDataframe(twitter_df, top_n=10)
tab_widget = Tabs(
    tabs=[
        TabPanel(
            child=occurrenceDistributionPerPolitiqueOrientation(df[df.num_words == n]),
            title=f"{n} mot" + ("s" if n > 1 else ""),
        )
        for n in (1, 2, 3)
    ],
    sizing_mode="stretch_width",
)
show(tab_widget)
3.2. Network Data#
# Reload the raw tweets and keep only keyword-matching retweets for the network.
twitter_df = pd.read_parquet('data/twitter_fev_to_juin_2023_retraite_data.parquet_v2')
# BUG FIX: the original used `~(twitter_df.retweet_id == None)`. In pandas an
# elementwise `== None` comparison is always False (missing values never
# compare equal), so the filter silently kept every row. notna() actually
# restricts the frame to retweets as intended.
twitter_df = twitter_df[twitter_df.retweet_id.notna()]
twitter_df = twitter_df[twitter_df.is_keywords]
# MP metadata export from nosdeputes.fr: maps the site slug to the Twitter handle.
deputy_df = pd.read_csv("data/nosdeputes.fr_deputes_en_mandat_2023-08-02.csv", sep=";")
slug2twitterat = dict(deputy_df[["slug", "twitter"]].values)
twitter_df["twitter_at"] = twitter_df.username.map(slug2twitterat)
twitter_df.head(2)
| username | full_text | date | in_reply_to_screen_name | in_reply_to_status_id_str | in_reply_to_user_id_str | retweet_id | retweet_username | retweet_user_id | is_quote_status | quoted_status_id_str | groupe_sigle | hashtag | is_hashtag | lemmatization | keywords_detected | is_keywords | twitter_at | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 464 | jean-luc-fugit | RT : La réforme des retraites soulève la quest... | 2023-02-01 00:00:59+00:00 | None | None | None | 1620569649967681542 | StanGuerini | 1911591212 | False | None | REN | [#retraites] | True | rt : le réforme de retraite soulever le questi... | [retraite, retrait, réforme, réforme de retrai... | True | Jean_LucFUGIT |
| 453 | laure-lavalette | RT : . (RN) interpelle (LFI) : "On ne comprend... | 2023-02-01 00:19:24+00:00 | None | None | None | 1620535420223213569 | LCP | 85362553 | False | None | RN | [] | False | rt : . ( RN ) interpelle ( LFI ) : " on ne com... | [obstruction, majorité] | True | LaureLavalette |
# Attach each group's display colour to the deputy table, then build
# handle-indexed lookup tables used by the graph styling below.
deputy_df["color"] = deputy_df.groupe_sigle.map(gp_politique_color)
dep2color = dict(zip(deputy_df.twitter, deputy_df.color))
dep2sigle = dict(zip(deputy_df.twitter, deputy_df.groupe_sigle))
def color(node):
    """Return the hex colour of *node*'s political group, grey for unknowns.

    Looks the Twitter handle up in the module-level ``dep2color`` map;
    accounts outside the deputy table get the neutral "#aaa".
    """
    # dict.get with a default replaces the two-step membership check
    # (and the non-idiomatic `not node in` spelling).
    return dep2color.get(node, "#aaa")
def gp_legend(node):
    """Return the parliamentary-group acronym for *node*, "NA" if unknown.

    Looks the Twitter handle up in the module-level ``dep2sigle`` map.
    """
    # Single dict.get lookup instead of membership test + second lookup.
    return dep2sigle.get(node, "NA")
import networkx as nx
# Edge list: (retweeter handle, retweeted account) pairs, as strings.
graph_df = twitter_df["twitter_at retweet_username groupe_sigle".split()].astype(str)
# NOTE(review): indexing a DataFrame with a boolean DataFrame masks cells to
# NaN, it does not drop rows; after astype(str) no NaN remains, so this line
# is effectively a no-op. `dropna()` *before* astype(str) was probably the
# intent — confirm before changing, since it would alter the degree filter
# below ("None" nodes currently contribute to degrees).
graph_df = graph_df[~(graph_df.isna())]
#graph_df = graph_df[graph_df.retweet_username.isin(deputy_df.twitter.values)]
# Edge weight = number of retweets between each (source, target) pair.
graph_df = graph_df.groupby("twitter_at retweet_username".split(),as_index=False).size()
# Directed retweet graph: deputy -> account they retweeted.
G = nx.from_pandas_edgelist(graph_df,source="twitter_at",target="retweet_username",edge_attr="size",create_using=nx.DiGraph)
# Prune low-activity nodes (total degree < 4). Removal is sequential, so the
# result depends on iteration order: a node's degree can drop below the
# threshold after a neighbour is removed, but each node is tested only once.
for node in list(G.nodes()):
    if G.degree(node)<4:
        G.remove_node(node)
# astype(str) turned missing retweet targets into the literal string "None".
if "None" in G: G.remove_node("None")
from ipysigma import SigmaGrid

# Centrality measures used as alternative node-size metrics in the grid.
betweeness = nx.betweenness_centrality(G)
page_rank = nx.pagerank(G)

# Visual styling shared by all three panels of the grid.
shared_style = dict(
    node_color=dep2sigle,
    default_node_border_color="#ffffff",
    node_color_palette=gp_politique_color,
    node_size_range=[3, 20],
    start_layout=10,
    default_edge_type="curve",
    label_font="Arial",
)

# Three synchronized views of the retweet graph: in-degree, betweenness
# centrality, and PageRank.
grid = SigmaGrid(G, hide_search=False, columns=2)
grid.add(
    node_size=G.in_degree,
    name="In Degree",
    node_label_size=G.degree,
    edge_size_range=[0.1, 1],
    **shared_style,
)
grid.add(
    node_size=lambda x: betweeness[x],
    node_label_size=lambda x: betweeness[x],
    edge_size_range=[1, 5],
    name="Betweeness",
    **shared_style,
)
grid.add(
    node_size=lambda x: page_rank[x],
    node_label_size=lambda x: page_rank[x],
    edge_size_range=[1, 5],
    name="Page Rank",
    **shared_style,
)
grid
3.2.1. Content similarity#
# Corpus for content similarity: deduplicated, original (non-retweet) tweets
# that matched the pension-reform keywords.
dataset_df = pd.read_parquet('data/twitter_fev_to_juin_2023_retraite_data.parquet_v2')
dataset_df = dataset_df.drop_duplicates("full_text")
# Idiom: vectorised .str.startswith replaces the per-row Python lambda
# (same result on non-null text; the original .apply would also fail on nulls).
dataset_df = dataset_df[~dataset_df.full_text.str.startswith("RT")]
dataset_df = dataset_df[dataset_df.is_keywords]
dataset_df.head()
| username | full_text | date | in_reply_to_screen_name | in_reply_to_status_id_str | in_reply_to_user_id_str | retweet_id | retweet_username | retweet_user_id | is_quote_status | quoted_status_id_str | groupe_sigle | hashtag | is_hashtag | lemmatization | keywords_detected | is_keywords | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 375 | mathieu-lefevre | Plus les impôts baissent et plus les recettes ... | 2023-02-01 06:32:02+00:00 | None | None | None | None | None | None | False | None | REN | [] | False | plus le impôt baisser et plus le recette de l’... | [travail, recette, courage, impôt] | True |
| 682 | frederic-boccaletti | Mme , "apparemment il y a une partie du foncti... | 2023-02-01 06:50:58+00:00 | None | None | None | None | None | None | False | None | RN | [#motionreferendaire] | True | mme , " apparemment il y avoir un partie de fo... | [pouvoir] | True |
| 488 | philippe-brun | Le prix de l'énergie est un élément essentiel ... | 2023-02-01 07:02:50+00:00 | None | None | None | None | None | None | False | None | SOC | [] | False | le prix de le énergie être un élément essentie... | [entreprise, compétitivité, patron] | True |
| 326 | kevin-mauvieux | Comprenez : « jamais nous ne défendrons la Fra... | 2023-02-01 07:10:59+00:00 | None | None | None | None | None | None | True | 1620493775515828226 | RN | [#NonALaReformeDesRetraites] | True | Comprenez : « jamais nous ne défendre le Franc... | [vote, retraite, retrait, ratio, français, Fra... | True |
| 453 | gregoire-de-fournas | Des centaines d'amendements de la NUPES ont ét... | 2023-02-01 07:14:30+00:00 | None | None | None | None | None | None | True | 1620493775515828226 | RN | [] | False | un centaine de amendement de le NUPES avoir êt... | [vote, ratio] | True |
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from umap import UMAP
from ipysigma import Sigma

# Per-deputy tweet counts and group lookup (node size / colour below).
count_tweet = dict(dataset_df.groupby("username").size())
user2groupe = dict(dataset_df["username groupe_sigle".split()].values)

# One document per deputy: concatenation of all their lemmatised tweets.
df_corpus = dataset_df.groupby("username", as_index=False).agg({"lemmatization": " ".join})
# NOTE(review): dep2sigle is keyed by Twitter handle while `username` holds the
# nosdeputes slug, so this column is likely all-NaN; it is unused downstream —
# confirm, then either map via user2groupe or drop the line.
df_corpus["groupe_sigle"] = df_corpus["username"].map(dep2sigle)

# Doc2Vec (PV-DM): embed each deputy's corpus as a 64-d vector tagged by username.
corpus = df_corpus.apply(
    lambda doc: gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(doc.lemmatization), [doc.username]
    ),
    axis=1,
)
model = gensim.models.doc2vec.Doc2Vec(vector_size=64, min_count=2, epochs=40, dm=1)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

# 2-D projection of the deputy vectors for plotting (seeded for reproducibility).
# (Dead commented-out plotly scatter removed; the ipysigma cell below renders it.)
user_vec = model.dv.vectors
user_vec_reduced = UMAP(random_state=seed).fit_transform(user_vec)
# Embedding scatter rendered with ipysigma: an edgeless graph whose layout is
# the fixed UMAP projection (one node per deputy).
G = nx.Graph()
# Idiom: bulk insertion replaces the manual enumerate loop (index was unused).
G.add_nodes_from(model.dv.index_to_key)

def size_func(node):
    """Node size / label size proportional to the deputy's tweet count."""
    return count_tweet[node]

Sigma(
    graph=G,
    layout={
        node: {"x": user_vec_reduced[ix, 0], "y": user_vec_reduced[ix, 1]}
        for ix, node in enumerate(model.dv.index_to_key)
    },
    node_size=size_func,
    node_color_palette=gp_politique_color,
    node_color=user2groupe,
    default_node_border_color="#efefef",
    hide_search=True,
    node_label_size=size_func,
)